Kish_HP2

Author

John Kish

IMPORT AND SETUP

Code
library(tidyverse)
library(here)
library(scales)
Import
here::i_am("analysis/Kish_HP2.qmd")
library(here)

df <- read.csv(here("data/Employee_Salaries.csv"))
Data Interprtation
# Department
# Abbreivation/accronym for the department name. Example: ABS for the Alcohol Beverage Services department

# Department_Name
# the department name for all active, permanent employees of Montgomery County, MD (Maryland)


# Division
# The division for each department. That is, a single department can have multiple divisions and focuses.
# Example: ABS has the divisions that include those of ABS 85 Administrative Services
# and ABS 85 Beer Delivery Operations

# Gender
# the Gender of M and F. Assuming that since this is filings with the government, M is to be the legal sex of the individual M for Male and F for Female.

# Base_Salary
# precise base salaries (including down to the cent) for the individual, at the end of the calendar year.
# this is projected data. Should be accurate, but projection allows for greater flexibiliity.


# Overtime_Pay
# amount of overtime pay an employee/individual received for working beyond what they were required to do


# Longevity_Pay
# Longevity pay gives additional compensation for qualifying employees who have worked for a good deal of time. The longer time that is worked, the greater the longevity pay. Specifics about how long are dealt with in-house.


# Grade
# the standing an employee has within a company. The internal codes for all positions are not provided, but we can safely look at M1, M2, M3, etc. as those are clearly levels of management. There are thousands of grades.

# M4-Senior Manager
# M3-Middle Manager
# M2-Middle Manager
# M1-First Level Manager

BACKGROUND

employees of Montgomery County, MD paid in calendar year 2023

PRELIMINARY ANALYSIS

Total Compensation Additional Compensation no missing values using sex and not gender

df$Total_Compensation <- df$Base_Salary + df$Overtime_Pay + df$Longevity_Pay
df$Additional_Compensation <- df$Overtime_Pay + df$Longevity_Pay

df <- rename(df, Sex = "Gender")
# str(df)
# df <- df |> mutate(across(where(is.double), as.integer))

sum(is.na(df))
[1] 0
# No NA Values

How many employees are there? How many of them get overtime, longevity, either, or both?

Question 1 Total Compensation and Overtime

Preliminary Analysis
#each row is a unique employee
Total_Employees <- nrow(df)
Employees_Without_Base <- filter(df, Base_Salary == 0.00) |> nrow()
df <- mutate(df, Employee = row_number())



Employees_Without_Overtime <- filter(df, Overtime_Pay == 0.00)
#4,651 employees got no overtime
Employees_With_Overtime <- anti_join(df, Employees_Without_Overtime)
Percent_Overtime = (nrow(Employees_With_Overtime) / Total_Employees) * 100
# 54% ish employees have overtime


Employees_Without_Longevity <- filter(df, Longevity_Pay == 0.00)
#7,454 employees got no longevity pay
Employees_With_Longevity <- anti_join(df, Employees_Without_Longevity)
Percent_Longevity = (nrow(Employees_With_Longevity) / Total_Employees) * 100
# 27% is of employees have longevity pay



Employees_Without_Over_Long <- filter(df, Overtime_Pay == 0.00 | Longevity_Pay == 0.00)
# 3491 employees have no overtime or longevity pay 
Employees_With_Over_Long <- anti_join(df, Employees_Without_Over_Long)
Percent_Over_Long <- (nrow(Employees_With_Over_Long) / Total_Employees) * 100
# 66% percent of employees have overtime or longevity pay


Employees_Without_Add_Comp <- filter(df, Additional_Compensation == 0.00)
# 3491 employees have no overtime or longevity pay 
Employees_With_Add_Comp <- anti_join(df, Employees_Without_Add_Comp)
Percent_Add_Comp <- (nrow(Employees_With_Add_Comp) / Total_Employees) * 100
# 66% percent of employees have overtime or longevity pay

There are 10291 employees total, where 0 employees have no base salary

Approximately 16.3% have overtime pay, 27.6% have longevity pay, and 16.3% have one or the either.

How much of their total compensation is base + overtime, longevity, either, or both (In terms of percentage?)

Respective Percentage Total Compensation
# addiing to the original df a new column, using mutate, that is the percent of total
# compensation that is each of the relevant factors.

df <- group_by(df, Employee) |>
  mutate(Percent_Over = Overtime_Pay/Total_Compensation)
df$Percent_Over <- round(df$Percent_Over, 3) * 100

df <- group_by(df, Employee) |>
  mutate(Percent_Long = Longevity_Pay/Total_Compensation)
df$Percent_Long <- round(df$Percent_Long, 3) * 100

df <- group_by(df, Employee) |>
  mutate(Percent_Base = Base_Salary/Total_Compensation)
df$Percent_Base <- round(df$Percent_Base, 3) * 100

df <- group_by(df, Employee) |>
  mutate(Percent_Add_Comp = Additional_Compensation/Total_Compensation)
df$Percent_Add_Comp <- round(df$Percent_Add_Comp, 3) * 100

df <- group_by(df, Employee) |>
  mutate(Percent_Over_Long = Base_Salary/Total_Compensation)
df$Percent_Over_Long <- round(df$Percent_Over_Long, 3) * 100

How much of their total compensation is base + overtime, longevity, either, or both?

Preliminary Analysis Plots
#overall
p <- df |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Employees") +
  geom_smooth(method = "lm")
p

Preliminary Analysis Plots
p2 <- Employees_With_Overtime |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Overtime Pay") +
  geom_smooth(method = "lm")
p2

Preliminary Analysis Plots
p3 <- Employees_With_Longevity |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Longevity Pay") +
  geom_smooth(method = "lm")
p3

Preliminary Analysis Plots
p4 <- Employees_With_Over_Long |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Overtime or longevity pay") +
  geom_smooth(method = "lm")
p4

Preliminary Analysis Plots
p5 <- Employees_With_Add_Comp |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
  geom_smooth(method = "lm")
p5

the coeff for these graphs

Preliminary Analysis Coefficients
Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(df))

Over_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Overtime))

Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Longevity))

Over_Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Over_Long))

Add_Comp_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Add_Comp))



Base_Total_lm$coefficients
(Intercept) Base_Salary 
3269.028428    1.070269 
Preliminary Analysis Coefficients
Over_Base_Total_lm$coefficients
  (Intercept)   Base_Salary 
-11910.606577      1.330305 
Preliminary Analysis Coefficients
Long_Base_Total_lm$coefficients
  (Intercept)   Base_Salary 
-11115.562891      1.284514 
Preliminary Analysis Coefficients
Over_Long_Base_Total_lm$coefficients
  (Intercept)   Base_Salary 
-32010.085198      1.566415 
Preliminary Analysis Coefficients
Add_Comp_Base_Total_lm$coefficients
 (Intercept)  Base_Salary 
-4803.650406     1.219424 

based on the coefficients for the graphs, those who only get base pay have their total compensation scale up the least with those choosing overtime and longevity pay scaling up the most.

Overtime pay scales slightly larger than longevity pay.

How much of their total compensation is base + overtime, longevity, either, or both (In terms of percentage?)

Preliminary Analysis Plots
p6 <- df |>
  group_by(Employee) |>
ggplot(mapping = aes(x = Percent_Base, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Percent Base", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
  geom_smooth(method = "lm")
p6

Preliminary Analysis Plots
p7 <- df |>
  group_by(Employee) |>
ggplot(mapping = aes(x = Percent_Long, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Percent Long", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
  geom_smooth(method = "lm")
p7

Preliminary Analysis Plots
p8 <- df |>
  group_by(Employee) |>
ggplot(mapping = aes(x = Percent_Over_Long, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Percent Over Long", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
  geom_smooth(method = "lm")
p8

Preliminary Analysis Plots
p9 <- df |>
  group_by(Employee) |>
ggplot(mapping = aes(x = Percent_Add_Comp, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Percent Add Comp", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
  geom_smooth(method = "lm")
p9

the coefficients of the percentage things

Preliminary Analysis Plots
coef(lm(Total_Compensation ~ Percent_Base, 
           data = filter(df)))
 (Intercept) Percent_Base 
  245353.463    -1576.445 
Preliminary Analysis Plots
coef(lm(Total_Compensation ~ Percent_Long, 
           data = filter(df)))
 (Intercept) Percent_Long 
   94062.970     4668.756 
Preliminary Analysis Plots
coef(lm(Total_Compensation ~ Percent_Over_Long, 
           data = filter(df)))
      (Intercept) Percent_Over_Long 
       245353.463         -1576.445 
Preliminary Analysis Plots
coef(lm(Total_Compensation ~ Percent_Add_Comp, 
           data = filter(df)))
     (Intercept) Percent_Add_Comp 
       87709.012         1576.445 

In terms of these slope coefficients, longevity pay has the greatest scaling.

Interestingly, the more percentage base pay you have, the less your total compensation

Question 2 Gender

Import
#overall
p <- df |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Employees") +
  geom_smooth(method = "lm") +
  facet_wrap(~Sex)
p

Import
p2 <- Employees_With_Overtime |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Overtime Pay") +
  geom_smooth(method = "lm") +
  facet_wrap(~Sex)
p2

Import
p3 <- Employees_With_Longevity |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Longevity Pay") +
  geom_smooth(method = "lm") +
  facet_wrap(~Sex)
p3

Import
p4 <- Employees_With_Over_Long |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Overtime or longevity pay") +
  geom_smooth(method = "lm") +
  facet_wrap(~Sex)
p4

Import
p5 <- Employees_With_Add_Comp |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
  geom_smooth(method = "lm") +
  facet_wrap(~Sex)
p5

the coeff for the data, on gender

Import
M_Add_Comp_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Add_Comp, Sex =="M"))
F_Add_Comp_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Add_Comp, Sex == "F"))
M_Over_Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Over_Long, Sex =="M"))
F_Over_Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Over_Long, Sex == "F"))
M_Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Longevity, Sex =="M"))
F_Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Longevity, Sex == "F"))
M_Over_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Overtime, Sex =="M"))
F_Over_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(Employees_With_Overtime, Sex == "F"))
M_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(df, Sex =="M"))
F_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary, 
           data = filter(df, Sex == "F"))

the coefficients of the percentage things, on gender

Import
# Male Add Comp
coef(lm(Total_Compensation ~ Percent_Add_Comp, 
           data = filter(df, , Sex =="M")))
     (Intercept) Percent_Add_Comp 
       89199.697         1577.999 
Import
# Female Add Comp
coef(lm(Total_Compensation ~ Percent_Add_Comp, 
           data = filter(df, , Sex =="F")))
     (Intercept) Percent_Add_Comp 
       86761.238         1282.196 
Import
coef(lm(Total_Compensation ~ Percent_Over_Long, 
           data = filter(df, , Sex =="M")))
      (Intercept) Percent_Over_Long 
       246999.627         -1577.999 
Import
coef(lm(Total_Compensation ~ Percent_Over_Long, 
           data = filter(df, , Sex =="F")))
      (Intercept) Percent_Over_Long 
       214980.872         -1282.196 
Import
coef(lm(Total_Compensation ~ Percent_Base, 
           data = filter(df, , Sex =="M")))
 (Intercept) Percent_Base 
  246999.627    -1577.999 
Import
coef(lm(Total_Compensation ~ Percent_Base, 
           data = filter(df, , Sex =="F")))
 (Intercept) Percent_Base 
  214980.872    -1282.196 
Import
coef(lm(Total_Compensation ~ Percent_Long, 
           data = filter(df, , Sex =="M")))
 (Intercept) Percent_Long 
   98329.223     5675.633 
Import
coef(lm(Total_Compensation ~ Percent_Long, 
           data = filter(df, , Sex =="F")))
 (Intercept) Percent_Long 
   88868.458     2432.145 

Question 3 Management

Management Data
Management_1 <- filter(df, Grade == "M1")
Management_2 <- filter(df, Grade == "M2")
Management_3 <- filter(df, Grade == "M3")

Management_df <- bind_rows(Management_1, Management_2, Management_3)





Management_df |> filter(Sex =="M") |> nrow()
[1] 214
Management Data
Management_df |> filter(Sex =="F") |> nrow()
[1] 232
Management Data
Management_df |> filter(Grade == "M1") |> filter(Sex =="M") |> nrow()
[1] 17
Management Data
Management_df |> filter(Grade == "M2") |> filter(Sex =="M") |> nrow()
[1] 73
Management Data
Management_df |> filter(Grade == "M2") |> filter(Sex =="M") |> nrow()
[1] 73
Management Data
Management_df |> filter(Grade == "M1") |> filter(Sex =="F") |> nrow()
[1] 11
Management Data
Management_df |> filter(Grade == "M2") |> filter(Sex =="F") |> nrow()
[1] 74
Management Data
Management_df |> filter(Grade == "M2") |> filter(Sex =="F") |> nrow()
[1] 74
Management Plots
p <- Management_df |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Employees") +
  geom_smooth(method = "lm") +
  facet_wrap(~ Grade)
p

Management Plots
p2 <- Management_df |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
  geom_point(aes(color = "red")) +
  scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
  scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
    labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Employees") +
  geom_smooth(method = "lm") +
  facet_grid(rows = vars(Grade), cols = vars(Sex))
p2

Management Summary
Management_df |>
  group_by(Sex) |>
summarize(mean = mean(Base_Salary))
# A tibble: 2 × 2
  Sex      mean
  <chr>   <dbl>
1 F     150932.
2 M     155995.
Management Summary
Management_df |>
  group_by(Sex) |>
summarize(mean = mean(Total_Compensation))
# A tibble: 2 × 2
  Sex      mean
  <chr>   <dbl>
1 F     150968.
2 M     157012.
Management Summary
df |>
  group_by(Sex) |>
summarize(mean = mean(Base_Salary))
# A tibble: 2 × 2
  Sex     mean
  <chr>  <dbl>
1 F     87498.
2 M     92383.
Management Summary
df |>
  group_by(Sex) |>
summarize(mean = mean(Total_Compensation))
# A tibble: 2 × 2
  Sex      mean
  <chr>   <dbl>
1 F      91559.
2 M     106084.
Management Summary
Management_df |>
filter(Sex == "M") |>
summarize(Total_Compensation)
# A tibble: 214 × 2
   Employee Total_Compensation
      <int>              <dbl>
 1        1            175873 
 2        2            145613.
 3       14            139407.
 4       16            152632.
 5      239            152940 
 6      364            175861.
 7      366            137728 
 8      436            144632.
 9      437            157272.
10      438            134497.
# ℹ 204 more rows
Management Summary
Management_df |>
filter(Sex == "F") |>
summarize(Total_Compensation)
# A tibble: 232 × 2
   Employee Total_Compensation
      <int>              <dbl>
 1        3            136970 
 2      115            166140.
 3      465            152940 
 4      471            152940 
 5      504            152940 
 6      521            177064.
 7      536            178511.
 8      537            175873 
 9      555            175873 
10      579            152940 
# ℹ 222 more rows